Data Cleaning

# Load the Mr. Boston cocktail data from the TidyTuesday repo:
# one row per (recipe, ingredient) with a free-text `measure` column.
boston_cocktails <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv")

# How often does each raw ingredient name appear? (569 distinct names)
count(boston_cocktails, ingredient, sort = TRUE)
## # A tibble: 569 x 2
##    ingredient            n
##    <chr>             <int>
##  1 Gin                 176
##  2 Fresh lemon juice   138
##  3 Simple Syrup        115
##  4 Vodka               114
##  5 Light Rum           113
##  6 Dry Vermouth        107
##  7 Fresh Lime Juice    107
##  8 Triple Sec          107
##  9 Powdered Sugar       90
## 10 Grenadine            85
## # … with 559 more rows
# Standardize ingredient names and parse the free-text measures into numbers.
cocktails_parsed <- boston_cocktails %>%
  mutate(
    ingredient = str_to_lower(ingredient),  # lower case so matching is case-insensitive
    ingredient = str_replace_all(ingredient, "-", " "),  # replace all dashes with spaces
    ingredient = str_remove(ingredient, " liqueur$"),
    # FIX: parentheses are regex metacharacters, so they must be escaped to
    # strip the literal trailing " (if desired)" from ingredient names
    ingredient = str_remove(ingredient, " \\(if desired\\)$"),
    # collapse near-duplicate names into a single canonical ingredient
    ingredient = case_when(
      str_detect(ingredient, "bitters") ~ "bitters",
      str_detect(ingredient, "lemon") ~ "lemon juice",
      str_detect(ingredient, "lime") ~ "lime juice",
      str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
      str_detect(ingredient, "orange") ~ "orange juice",
      TRUE ~ ingredient
    ),
    # bitters are measured in dashes, not ounces, so relabel their unit
    measure = case_when(
      str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
      TRUE ~ measure
    ),
    measure = str_replace(measure, " ?1/2", ".5"),  # e.g. "1 1/2 oz" -> "1.5 oz"
    measure = str_replace(measure, " ?3/4", ".75"),
    measure = str_replace(measure, " ?1/4", ".25"),
    measure_number = parse_number(measure),  # extract the leading number only
    # a dash is tiny; treat 1 dash as roughly 1/50 oz so units are comparable
    measure_number = if_else(str_detect(measure, "dash$"),
      measure_number / 50,
      measure_number
    )
  )

cocktails_parsed
## # A tibble: 3,643 x 7
##    name    category   row_id ingredient_numb… ingredient  measure measure_number
##    <chr>   <chr>       <dbl>            <dbl> <chr>       <chr>            <dbl>
##  1 Gauguin Cocktail …      1                1 light rum   2 oz              2   
##  2 Gauguin Cocktail …      1                2 passion fr… 1 oz              1   
##  3 Gauguin Cocktail …      1                3 lemon juice 1 oz              1   
##  4 Gauguin Cocktail …      1                4 lime juice  1 oz              1   
##  5 Fort L… Cocktail …      2                1 light rum   1.5 oz            1.5 
##  6 Fort L… Cocktail …      2                2 sweet verm… .5 oz             0.5 
##  7 Fort L… Cocktail …      2                3 orange jui… .25 oz            0.25
##  8 Fort L… Cocktail …      2                4 lime juice  .25 oz            0.25
##  9 Apple … Cordials …      3                1 apple schn… 3 oz              3   
## 10 Apple … Cordials …      3                2 cinnamon s… 1 oz              1   
## # … with 3,633 more rows
# Re-count after cleaning: 569 raw names collapse down to 440.
count(cocktails_parsed, ingredient, sort = TRUE)
## # A tibble: 440 x 2
##    ingredient       n
##    <chr>        <int>
##  1 lemon juice    311
##  2 lime juice     180
##  3 gin            176
##  4 bitters        166
##  5 orange juice   124
##  6 simple syrup   115
##  7 vodka          114
##  8 light rum      113
##  9 dry vermouth   107
## 10 triple sec     107
## # … with 430 more rows

We still have 440 ingredients, too many! Let’s filter for only ingredients that appear more than 15 times (matching the `filter(n > 15)` below, which leaves 40 ingredients):

# Keep only ingredients that appear in more than 15 rows (440 -> 40
# ingredients), deduplicate ingredients within a recipe, and drop NAs.
cocktails_parsed_filtered <- cocktails_parsed %>%
  add_count(ingredient) %>%  # temporary per-ingredient frequency column
  filter(n > 15) %>%
  select(-n) %>%
  # one row per (recipe, ingredient): the name-collapsing above can create
  # duplicates (e.g. "lemon slice" and "lemon juice" both become "lemon juice")
  distinct(row_id, ingredient, .keep_all = TRUE) %>%
  na.omit()

Exploratory Data Analysis

What are the most common ingredients?

# Bar chart of the 20 most frequent ingredients, ordered by count.
cocktails_parsed_filtered %>%
  count(ingredient, sort = TRUE) %>%
  head(20) %>%
  mutate(ingredient = fct_reorder(ingredient, n)) %>%
  ggplot(aes(x = n, y = ingredient)) +
  geom_col() +
  labs(title = "Most common ingredients in Mr Boston recipes")

# Same top-20 ingredients as a dot plot (flipped so ingredients read down the side).
cocktails_parsed_filtered %>%
  count(ingredient, sort = TRUE) %>%
  head(20) %>%
  mutate(ingredient = fct_reorder(ingredient, n)) %>%
  ggplot(aes(x = ingredient, y = n)) +
  geom_dotplot(binaxis = "y", stackdir = "center") +
  coord_flip() +
  labs(title = "Most common ingredients in Mr Boston recipes")

# How many distinct recipes fall into each category?
cocktails_parsed_filtered %>%
  distinct(name, category) %>%
  count(category, sort = TRUE)
## # A tibble: 11 x 2
##    category                  n
##    <chr>                 <int>
##  1 Cocktail Classics       428
##  2 Vodka                   137
##  3 Rum - Daiquiris         110
##  4 Whiskies                104
##  5 Tequila                  85
##  6 Brandy                   46
##  7 Gin                      17
##  8 Cordials and Liqueurs     6
##  9 Shooters                  2
## 10 Non-alcoholic Drinks      1
## 11 Rum                       1
# Total number of distinct recipes (denominator for the percentages below).
n_recipes <- n_distinct(cocktails_parsed_filtered$name)

# Share of all recipes containing each top-20 ingredient, filled by the
# four largest categories (everything else lumped into "Other").
cocktails_parsed_filtered %>%
  count(category, ingredient, sort = TRUE) %>%
  mutate(
    category = fct_lump(category, 4),
    ingredient = fct_lump(ingredient, 20)
  ) %>%
  filter(ingredient != "Other") %>%
  mutate(ingredient = fct_reorder(ingredient, n, sum)) %>%
  ggplot(aes(n / n_recipes, ingredient, fill = category)) +
  geom_col() +
  scale_x_continuous(labels = percent_format()) +
  labs(
    title = "Most common ingredients in Mr Boston recipes",
    x = "% of all recipes",
    y = "Ingredient",
    fill = "Category"
  )

Which ingredients tend to appear together in the same recipe?

library(widyr) 
library(tidytext)

# Pairwise correlation of ingredients across recipes: high correlation means
# two ingredients tend to appear in the same drink (widyr::pairwise_cor).
ingredient_pairs <- cocktails_parsed_filtered %>% 
        add_count(ingredient) %>%   # adds a per-ingredient frequency column n (no collapsing)
        filter(n >= 5) %>% # keep only ingredients appearing in at least 5 rows
        pairwise_cor(ingredient, name, sort = TRUE)

ingredient_pairs
## # A tibble: 1,560 x 3
##    item1          item2          correlation
##    <chr>          <chr>                <dbl>
##  1 whole egg      powdered sugar       0.360
##  2 powdered sugar whole egg            0.360
##  3 port           powdered sugar       0.278
##  4 powdered sugar port                 0.278
##  5 dry vermouth   gin                  0.274
##  6 gin            dry vermouth         0.274
##  7 egg yolk       powdered sugar       0.251
##  8 powdered sugar egg yolk             0.251
##  9 egg white      powdered sugar       0.215
## 10 powdered sugar egg white            0.215
## # … with 1,550 more rows

Which ingredients are most closely correlated with gin?:

# The ten ingredients most strongly correlated with gin.
ingredient_pairs %>%
  filter(item1 == "gin") %>%
  head(10) %>%
  mutate(item2 = fct_reorder(item2, correlation)) %>%
  ggplot(aes(x = correlation, y = item2)) +
  geom_col() +
  labs(title = "What ingredients are most correlated with gin?")

Note that dry vermouth is at the top - makes sense because dry vermouth + gin is a common base for martinis!

# Top-8 correlated ingredients for four base ingredients, one facet each.
ingredient_pairs %>%
  filter(item1 %in% c("gin", "vodka",  "bourbon whiskey", "lemon juice")) %>%
  group_by(item1) %>%
  top_n(8, correlation) %>%
  # ungroup() is required here so reorder_within() operates on the whole
  # data frame (it wasn't included in David's original script)
  ungroup() %>%
  mutate(item2 = reorder_within(item2, correlation, item1)) %>%
  ggplot(aes(correlation, item2)) +
  geom_col() +
  facet_wrap(~ item1, scales = "free_y") +
  # strips the "___item1" suffix that reorder_within() appends to labels
  scale_y_reordered() +
  # FIX: corrected the typo "ingreidents" in the plot title
  labs(title = "What ingredients are most correlated with particular ingredients?")

Unsupervised Learning

Clustering

# Per-ingredient summary: frequency, how many rows have a parseable measure,
# average relative position within a recipe, and average serving size.
ingredients_summarized <- cocktails_parsed_filtered %>%
  group_by(name) %>%
  # relative position in (0, 1] within each recipe, so recipes with
  # different numbers of ingredients are comparable
  mutate(percentile = row_number() / n()) %>%
  ungroup() %>%
  group_by(ingredient) %>%
  summarize(
    n = n(),
    n_with_measure = sum(!is.na(measure_number)),
    avg_position = mean(percentile),
    avg_serving = mean(measure_number, na.rm = TRUE)
  ) %>%
  arrange(desc(n))

#ingredients_summarized

library(ggraph)
library(igraph)
library(ggrepel)

# Strongest 70 pairwise correlations (each pair appears in both orderings).
top_cors <- head(ingredient_pairs, 70)

# Vertex metadata for the network graph: summary stats for every
# ingredient that shows up in the top correlations.
ingredient_info <- ingredients_summarized %>%
  filter(ingredient %in% top_cors$item1) %>%
  distinct()

# Network graph of co-occurring ingredients (around the ~35 min mark of the
# screencast). Edges connect correlated ingredients; node size = frequency,
# node color = average position within a recipe.
top_cors %>% 
        graph_from_data_frame(vertices = ingredient_info) %>% 
        ggraph(layout = "fr") +
        geom_edge_link() +
        geom_node_text(aes(label = name), repel = TRUE) +
        geom_node_point(aes(size = 1.1* n)) +        # slightly larger point underneath gives each node an outline
        geom_node_point(aes(size = n, color = avg_position)) +
        scale_color_gradient2(low = "red", high = "blue", midpoint = .5,
                              labels = scales::percent_format()) +
        labs(size = "# of recipes",
             color = "Avg position in drink",
             title = "The universe of cocktail ingredients",
             subtitle = "Connected ingredients tend to appear in the same drink. Red ingredients are early in the recipe, \n blue ingredients tend to be later")

Dimensionality reduction

I typically do my data cleaning with data in a tidy format, like boston_cocktails or cocktails_parsed. When it’s time for modeling, we usually need the data in a wider format, so let’s use pivot_wider() to reshape our data.

# Widen to the modeling format: one row per cocktail, one column per
# ingredient holding its amount (0 when absent), with snake_case names.
cocktails_df <- cocktails_parsed_filtered %>%
  select(-ingredient_number, -row_id, -measure) %>%  # drop columns not needed for modeling
  pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) %>%
  janitor::clean_names()

cocktails_df
## # A tibble: 937 x 42
##    name  category light_rum lemon_juice lime_juice sweet_vermouth orange_juice
##    <chr> <chr>        <dbl>       <dbl>      <dbl>          <dbl>        <dbl>
##  1 Gaug… Cocktai…      2           1          1               0           0   
##  2 Fort… Cocktai…      1.5         0          0.25            0.5         0.25
##  3 Cuba… Cocktai…      2           0          0.5             0           0   
##  4 Cool… Cocktai…      0           0          0               0           1   
##  5 John… Whiskies      0           1          0               0           0   
##  6 Cher… Cocktai…      1.25        0          0               0           0   
##  7 Casa… Cocktai…      2           0          1.5             0           0   
##  8 Cari… Cocktai…      0.5         0          0               0           0   
##  9 Ambe… Cordial…      0           0.25       0               0           0   
## 10 The … Whiskies      0           0.5        0               0           0   
## # … with 927 more rows, and 35 more variables: powdered_sugar <dbl>,
## #   dark_rum <dbl>, cranberry_juice <dbl>, pineapple_juice <dbl>,
## #   bourbon_whiskey <dbl>, simple_syrup <dbl>, cherry_flavored_brandy <dbl>,
## #   light_cream <dbl>, triple_sec <dbl>, maraschino <dbl>, amaretto <dbl>,
## #   grenadine <dbl>, apple_brandy <dbl>, brandy <dbl>, gin <dbl>,
## #   anisette <dbl>, dry_vermouth <dbl>, apricot_flavored_brandy <dbl>,
## #   bitters <dbl>, straight_rye_whiskey <dbl>, benedictine <dbl>,
## #   egg_white <dbl>, half_and_half <dbl>, vodka <dbl>, grapefruit_juice <dbl>,
## #   blended_scotch_whiskey <dbl>, port <dbl>, white_creme_de_cacao <dbl>,
## #   citrus_flavored_vodka <dbl>, whole_egg <dbl>, egg_yolk <dbl>,
## #   blended_whiskey <dbl>, dubonnet <dbl>, blanco_tequila <dbl>,
## #   old_mr_boston_dry_gin <dbl>

Principal component analysis

This dataset is especially delightful because we get to use recipes with recipes! Let’s load the tidymodels metapackage and implement principal component analysis with a recipe.

library(tidymodels)

# Specify (but do not yet run) the PCA preprocessing recipe:
# name/category become identifiers, the numeric ingredient columns
# are centered and scaled, then reduced with PCA.
pca_rec <- recipe(~., data = cocktails_df) %>%
  update_role(name, category, new_role = "id") %>%  # exclude the ID columns from the predictors
  step_normalize(all_predictors()) %>%   # centers and scales all predictors (required before PCA)
  step_pca(all_predictors())

pca_rec  # PCA hasn't actually been run yet - prep() in the next step estimates it
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          2
##  predictor         40
## 
## Operations:
## 
## Centering and scaling for all_predictors
## No PCA components were extracted.

Now we actually run PCA:

pca_prep <- prep(pca_rec)  # estimates the recipe steps on the data: normalization stats and PCA loadings
#pca_prep

Let’s look at our results:

# Extract the PCA step (step 2 of the recipe) as a tidy tibble:
# one row per (term, component) pair with its loading value.
tidied_pca <- tidy(pca_prep, 2)
tidied_pca
## # A tibble: 1,600 x 4
##    terms             value component id       
##    <chr>             <dbl> <chr>     <chr>    
##  1 light_rum        0.163  PC1       pca_VFQSw
##  2 lemon_juice     -0.0140 PC1       pca_VFQSw
##  3 lime_juice       0.224  PC1       pca_VFQSw
##  4 sweet_vermouth  -0.0661 PC1       pca_VFQSw
##  5 orange_juice     0.0308 PC1       pca_VFQSw
##  6 powdered_sugar  -0.476  PC1       pca_VFQSw
##  7 dark_rum         0.124  PC1       pca_VFQSw
##  8 cranberry_juice  0.0954 PC1       pca_VFQSw
##  9 pineapple_juice  0.119  PC1       pca_VFQSw
## 10 bourbon_whiskey  0.0963 PC1       pca_VFQSw
## # … with 1,590 more rows
# Loadings of every ingredient on the first five principal components.
tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(x = value, y = terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

library(tidytext)

# The eight ingredients contributing most (by absolute loading) to each of
# the first four principal components, split by sign of the loading.
tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  # reorder_within() (tidytext) orders terms separately inside each facet;
  # it must be paired with scale_y_reordered() below
  mutate(terms = reorder_within(terms, by = abs(value), within = component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  # FIX: removed a stray empty argument (double comma) in facet_wrap()
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +  # strips the suffix reorder_within() appends to labels
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

So PC1 is about powdered sugar + egg + gin drinks vs. simple syrup + lime + tequila drinks. This is the component that explains the most variation in drinks. PC2 is mostly about vermouth, both sweet and dry.

How are the cocktails distributed in the plane of the first two components?

# Project every cocktail into the PC1/PC2 plane, colored by category.
juice(pca_prep) %>%  # juice() returns the processed (PCA-transformed) data
  ggplot(aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(color = category), alpha = 0.7, size = 2) +
  # check_overlap drops labels that would collide; hjust keeps them inside the panel
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

  • Fizzy, eggy, powdered sugar drinks are to the left.
  • Simple syrup, lime, tequila drinks are to the right.
  • Vermouth drinks are more to the top.